Analysing NYC Citibike Usage Patterns in 2019

Hypothesis

Citibike ride patterns are significantly influenced by different factors across seasons

Introduction and Data Description

Problem definition, background and brief literature review. Justify your choice of data and provide sources

# Loading necessary libraries: spatial data handling (sf, sp), thematic
# mapping (tmap, tmaptools), data wrangling (dplyr, lubridate), plotting
# (ggplot2) and geostatistics (gstat)
library(sf)
library(tmap)
library(tmaptools)
library(dplyr)
library(lubridate)
library(ggplot2)
library(gstat)
library(sp)
Census Data from NYC Dept of City Planning & U.S. Census Bureau

Source: https://www.nyc.gov/site/planning/planning-level/nyc-population/2020-census.page#2020-census-results

# Read the census tract shapefile (layer nyct2020); CRS reported by st_read
# below is NAD83 / New York Long Island (ftUS)
census <- st_read(dsn="Assignment/Data/nyc2020_census/nyct2020.shp", layer="nyct2020")
## Reading layer `nyct2020' from data source 
##   `/Users/xiaoweilim/Desktop/CEGE0097_SAG/Assignment/Data/nyc2020_census/nyct2020.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2325 features and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
# Load the population csv data; BCT2020 is coerced to character so it matches
# the type of the shapefile's BoroCT2020 join key below
population <- read.csv("Assignment/Data/nyc_censusdata_2020.csv")
population$BCT2020 <- as.character(population$BCT2020)

# Join population data to census data (left join keeps every census tract,
# with NA for tracts absent from the population csv)
census_joined <- census %>%
  left_join(population, by= c("BoroCT2020"="BCT2020"))

# Plot census polygons as a quick visual sanity check of the join
plot(census_joined["CT2020"])

Citibike Data from Lyft (Citibike’s operator, data collected in collaboration with NYC Department of Transportation)

Source: https://citibikenyc.com/system-data

# Load Citibike trip-level data for January 2019 (one row per ride)
citibike_jan <- read.csv("Assignment/Data/2019-citibike-tripdata/1_January/201901-citibike-tripdata_1.csv")

# Busier months ship as multiple csv files (e.g. summer months = more rides);
# read each part and stack with bind_rows, as shown here for July:
# citibike_1 <- read.csv("Assignment/Data/2019-citibike-tripdata/7_July/201907-citibike-tripdata_1.csv")
# citibike_2 <- read.csv("Assignment/Data/2019-citibike-tripdata/7_July/201907-citibike-tripdata_2.csv")
# citibike_3 <- read.csv("Assignment/Data/2019-citibike-tripdata/7_July/201907-citibike-tripdata_3.csv")
# citibike_july <- bind_rows(citibike_1, citibike_2, citibike_3)

Data Wrangling: Cleaning up the Date/Time formatting

# Convert starttime and stoptime into DateTime format (POSIXct via lubridate)
citibike_jan$starttime <- ymd_hms(citibike_jan$starttime)
citibike_jan$stoptime <- ymd_hms(citibike_jan$stoptime)

# Cleaning up for weekday/weekend and hour information
citibike_jan <- citibike_jan %>%
  mutate(
    # Weekday or weekend: wday() returns 1 (Sunday) through 7 (Saturday),
    # so 1 and 7 are weekend days. The %in% test already yields a logical
    # vector, so the original ifelse(..., TRUE, FALSE) wrapper is redundant.
    is_weekend = wday(starttime) %in% c(1, 7),
    # Hour of day (0-23) the ride started; used for the time-band counts below
    start_hour = hour(starttime)
  )

Data Wrangling: Station-Oriented Dataset

The Citibike dataset is currently organised transactionally, with each ride representing one transaction. The raw data is transformed into a station-oriented dataset with attributes such as station id/lat/long, ride counts, and user counts. Additional attributes are computed, such as: (i) the time and day (weekday or weekend) at which rides start from the station; (ii) End Ride Count, which sums the number of bikes returned/docked back at this particular station; and (iii) Median Trip Duration for rides originating from or ending at the station.

# Create the station dataset with attributes on station id/lat/long, ride counts, user counts and time-based counts
# One row per start station in the result
start_station_jan <- citibike_jan %>%
  # Group by start station id and its coordinates (lat/long are in the keys
  # so they survive summarise); user types are split inside summarise below
  group_by(start.station.id, start.station.latitude, start.station.longitude) %>%
  summarise(
    ride_start_count = n(),
    usertype_subscriber_count = sum(usertype == "Subscriber"),
    usertype_customer_count = sum(usertype == "Customer"),
    
    # Weekday counts for time intervals AM/PM Peak/Off-Peak - ref Liu et al
    ride_start_weekday_7_10 = sum(!is_weekend & start_hour>=7 & start_hour<10),
    ride_start_weekday_10_17 = sum(!is_weekend & start_hour>=10 & start_hour<17),
    ride_start_weekday_17_0 = sum(!is_weekend & start_hour>=17 & start_hour<24),
    ride_start_weekday_0_7 = sum(!is_weekend & start_hour>=0 & start_hour<7),
    
    # Weekend counts for leisure/others - ref Liu et al
    ride_start_weekend_10_0 = sum(is_weekend & start_hour>=10 & start_hour<24),
    ride_start_weekend_0_10 = sum(is_weekend & start_hour>=0 & start_hour<10)
  ) %>%
  # Standardise key names so start/end tables can be joined on station_id
  rename(
    station_id = start.station.id,
    station_latitude = start.station.latitude,
    station_longitude = start.station.longitude
  )

# Calculate end station counts (rides docked at each station); the
# usertype/time-band splits are only needed for start stations above
end_station_jan <- citibike_jan %>%
  group_by(end.station.id, end.station.latitude, end.station.longitude) %>%
  summarise(
    ride_end_count = n()
  ) %>%
  # Same standardised key names as start_station_jan for the join below
  rename(
    station_id = end.station.id,
    station_latitude = end.station.latitude,
    station_longitude = end.station.longitude
  )

# Combine start and end to return ride activity by station.
# full_join keeps stations that only ever appear as an origin or only as a
# destination; their missing counts come through as NA and are zeroed below.
station_jan <- full_join(start_station_jan, end_station_jan,
                          by= c("station_id", "station_latitude", "station_longitude")) %>%
  # Deal with NA values introduced by the full join
  mutate(
    ride_start_count = ifelse(is.na(ride_start_count), 0, ride_start_count),
    ride_end_count = ifelse(is.na(ride_end_count), 0, ride_end_count),
    ride_activity = ride_start_count + ride_end_count
  )

# Add median trip duration for start stations (median is robust to the
# extreme tripduration outliers typical of undocked/lost bikes)
start_station_duration <- citibike_jan %>%
  group_by(start.station.id) %>%
  summarise(median_trip_duration_start = median(tripduration, na.rm=TRUE)) %>%
  rename(station_id = start.station.id)

# Add median trip duration for end stations
end_station_duration <- citibike_jan %>%
  group_by(end.station.id) %>%
  summarise(median_trip_duration_end = median(tripduration, na.rm=TRUE)) %>%
  rename(station_id = end.station.id)

# Merge median trip durations with station_jan (left joins keep every station)
station_jan <- station_jan %>%
  left_join(start_station_duration, by= "station_id") %>%
  left_join(end_station_duration, by= "station_id")

Plotting the aggregated Citibike Station data

# Convert cleaned January dataset into sf point object (WGS 84, EPSG:4326);
# remove=FALSE keeps the lat/long columns alongside the geometry
station_jan_sf <- station_jan %>%
  st_as_sf(coords = c("station_longitude", "station_latitude"), crs=4326, remove=FALSE)

# Plot map of points coloured by combined start+end ride activity
tm_shape(station_jan_sf) +
  tm_bubbles(size=0.1, col="ride_activity", palette="YlOrRd") +
  tm_layout(main.title = "Citibike Jan 2019",
            main.title.size=1.5, frame=FALSE, legend.position = c("left", "top"))

Creating a Census Tract-Aggregated Citibike Usage dataset

Combining the Census and Citibike Station data

# Convert Census polygons from NAD83 / Long Island (ftUS) to the stations'
# CRS (EPSG:4326, WGS 84) so both layers share one coordinate system
census_joined <- st_transform(census_joined, crs=st_crs(station_jan_sf))

# Plot interim data. Use the transformed `census_joined` layer here (the
# original plotted the raw `census` object, leaving the st_transform above
# unused in this map)
tm_shape(census_joined) +
  tm_borders() +
  tm_shape(station_jan_sf) +
  tm_bubbles(size=0.1, col="ride_activity", palette="YlOrRd") + 
  tm_layout(main.title = "Citibike Jan 2019 and Census Polygons",
            main.title.size=1, frame=FALSE, legend.position= c("left", "top"))

# Creating an aggregated station dataset at census tract level
station_jan_nyc <- station_jan_sf %>%
  # Inner join (left=FALSE) drops stations outside NYC (no containing tract)
  st_join(census_joined, join= st_within, left=FALSE)
# Sum the station-level attributes up to one row per census tract (BoroCT2020)
ct_station_jan <- station_jan_nyc %>%
  group_by(BoroCT2020) %>%
  summarise(
    num_stations = n(),
    total_ride_start_count = sum(ride_start_count, na.rm=TRUE),
    total_ride_end_count = sum(ride_end_count, na.rm=TRUE),
    total_ride_activity = sum(ride_activity, na.rm=TRUE),
    # Median of the per-station medians within each tract
    median_trip_duration_start = median(median_trip_duration_start, na.rm=TRUE),
    median_trip_duration_end = median(median_trip_duration_end, na.rm=TRUE),
    usertype_subscriber_count = sum(usertype_subscriber_count, na.rm=TRUE),
    usertype_customer_count = sum(usertype_customer_count, na.rm=TRUE),
    # Adding in the time factor
    weekday_7_10 = sum(ride_start_weekday_7_10, na.rm=TRUE),
    weekday_10_17 = sum(ride_start_weekday_10_17, na.rm=TRUE),
    weekday_17_0 = sum(ride_start_weekday_17_0, na.rm=TRUE),
    weekday_0_7 = sum(ride_start_weekday_0_7, na.rm=TRUE),
    weekend_10_0 = sum(ride_start_weekend_10_0, na.rm=TRUE),
    weekend_0_10 = sum(ride_start_weekend_0_10, na.rm=TRUE)
  )

# Converting to df to allow for inner join and drop column for polygons without Citibike docks
ct_station_jan_df <- as.data.frame(ct_station_jan)
agg_citibike_jan <- census_joined %>%
  inner_join(ct_station_jan_df, by= "BoroCT2020")

# colnames(agg_citibike_jan)
# agg_citibike_jan <- agg_citibike_jan_copy

# List of columns to convert from character to integer
columns_to_convert <- c("Pop1", "PopU5", "Pop5t9", "Pop10t14", "Pop15t19", "Pop20t24", "Pop25t29", "Pop30t34", "Pop35t39", "PopU18", "Pop65pl", "GQClgHsg", "Fam", "HUnits")
# Note: chr values with comma (e.g. "3,512") coerce to NA, so the comma is
# stripped before as.numeric(). Subset a geometry-free copy: `[` on an sf
# object keeps the sticky geometry column, so lapply() over
# agg_citibike_jan[columns_to_convert] returned 15 elements for the 14
# target columns (the earlier "provided 15 variables to replace 14
# variables" warning).
agg_citibike_jan[columns_to_convert] <- lapply(
  st_drop_geometry(agg_citibike_jan)[columns_to_convert], function(x) {
    x <- gsub(",", "", x)             # Remove commas
    x <- as.numeric(x)                # Convert to numeric
    x[is.na(x)] <- 0                  # Replace NAs with 0
    return(x)
  })
# Replace NA with 0 for all numeric and integer columns.
# Identify the numeric columns on a geometry-free copy: subsetting an sf
# object with `[` re-attaches the sticky geometry column, which is what
# produced the earlier "provided 48 variables to replace 47 variables"
# warning.
num_cols <- names(which(sapply(st_drop_geometry(agg_citibike_jan), is.numeric)))
agg_citibike_jan[num_cols] <-
  lapply(st_drop_geometry(agg_citibike_jan)[num_cols], function(x) {
    x[is.na(x)] <- 0
    return(x)
  })
# Verify if there are any remaining NAs (any that remain sit in
# non-numeric columns, which are deliberately untouched here)
sum(is.na(agg_citibike_jan))
## [1] 416
# Create a new column 'Pop19t64' as Pop1 - PopU18 - Pop65pl
agg_citibike_jan$Pop19t64 <- agg_citibike_jan$Pop1 - agg_citibike_jan$PopU18 - agg_citibike_jan$Pop65pl

# Verify the result
head(agg_citibike_jan[, c("Pop1", "PopU18", "Pop65pl", "Pop19t64")])
## Simple feature collection with 6 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -73.99481 ymin: 40.70913 xmax: -73.9733 ymax: 40.72943
## Geodetic CRS:  WGS 84
##    Pop1 PopU18 Pop65pl Pop19t64                     geometry.x
## 1 11616   1723    3019     6874 MULTIPOLYGON (((-73.99022 4...
## 2  3543    679     784     2080 MULTIPOLYGON (((-73.98837 4...
## 3  7934    845    1201     5888 MULTIPOLYGON (((-73.98985 4...
## 4  6969    878    1333     4758 MULTIPOLYGON (((-73.97875 4...
## 5  3609    489     488     2632 MULTIPOLYGON (((-73.97689 4...
## 6  6819    829    1342     4648 MULTIPOLYGON (((-73.9733 40...
# # Verify if there are any remaining NAs and list the columns with NAs
# remaining_na_cols <- sapply(agg_citibike_jan, function(x) any(is.na(x)))
# 
# # Print out the columns with remaining NAs
# if (any(remaining_na_cols)) {
#   cat("Columns with remaining NAs:\n")
#   print(names(remaining_na_cols[remaining_na_cols == TRUE]))
# } else {
#   cat("No remaining NAs in the numeric columns.\n")
# }


# Plotting the map with context: light grey census tracts as a backdrop, and
# tracts that contain Citibike docks filled by quantiles of total ride starts
tm_shape(census_joined)+
  tm_borders(col= "white")+
  tm_fill(col= "lightgrey")+
  tm_shape(agg_citibike_jan)+
  tm_polygons(col= "total_ride_start_count",
              style= "quantile",
              palette= "YlOrRd",
              title= "Total Ride Start Count")+
  tm_layout(title= "Citibike Ride Activity by Census Tract (Jan 2019)",
            title.position= c("left", "top"),
            legend.position= c("left", "top"),
            legend.outside= FALSE,
            frame= FALSE)

Exploratory Spatial Data Analysis

Summarise the properties of your data

# Plotting the map with context: median trip duration of rides starting in
# each tract that contains Citibike docks
tm_shape(census_joined)+
  tm_borders(col= "white")+
  tm_fill(col= "lightgrey")+
  tm_shape(agg_citibike_jan)+
  tm_polygons(col= "median_trip_duration_start",
              style= "quantile",
              palette= "RdPu",
              title= "Median Trip Duration")+
  tm_layout(title= "Citibike Ride Activity by Census Tract (Jan 2019)",
            title.position= c("left", "top"),
            legend.position= c("left", "top"),
            legend.outside= FALSE,
            frame= FALSE)

# Plot histogram of total ride activity across tracts
ggplot(data=agg_citibike_jan, aes(total_ride_activity)) + geom_histogram()

#####Spatial Autocorrelation Test

Global Moran’s I

library(spdep)
library(knitr)
# Queen-contiguity neighbours from the tract polygons; some tracts have no
# neighbours (see warning), hence zero.policy=TRUE below
nb <- poly2nb(agg_citibike_jan)
## Warning in poly2nb(agg_citibike_jan): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(agg_citibike_jan): neighbour object has 6 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
# Row-standardised weights matrix (kept for inspection)
W <- nb2mat(nb, style='W', zero.policy = TRUE)
colnames(W) <- rownames(W)

# Creating a weights list from the same neighbour object
Wl <- nb2listw(nb, zero.policy = TRUE)
# Global Moran's I point estimate (I) and sample kurtosis (K)
moran(agg_citibike_jan$total_ride_start_count, Wl, n=length(Wl$neighbours), S0=Szero(Wl))
## $I
## [1] 0.4743091
## 
## $K
## [1] 11.69548
# Test under randomisation (analytical variance of Moran's I)
moran.test(agg_citibike_jan$total_ride_start_count, Wl)
## 
##  Moran I test under randomisation
## 
## data:  agg_citibike_jan$total_ride_start_count  
## weights: Wl  
## n reduced by no-neighbour observations  
## 
## Moran I statistic standard deviate = 14.844, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.469748398      -0.002433090       0.001011878
# Test using Monte-Carlo simulation (999 permutations, so p >= 1/1000)
moran.mc(agg_citibike_jan$total_ride_start_count, Wl, nsim=999)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  agg_citibike_jan$total_ride_start_count 
## weights: Wl  
## number of simulations + 1: 1000 
## 
## statistic = 0.46975, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

At the census tract level, total ride start counts display significant, positive autocorrelation

Moran’s Scatterplot

# Moran Scatterplot: the slope of the fitted line is global Moran's I;
# influential tracts are labelled with their neighbourhood (NTA) names
moran.plot(agg_citibike_jan$total_ride_start_count, Wl, xlab='Total Ride Start Count', ylab='Spatially Lagged Ride Start Count', labels=agg_citibike_jan$NTAName)

[add in finding - Central Park, most values in low-low]

Gi and Gi*

# Gi and Gi*
# Based 1.6km roughly off the width of 2 Manhattan blocks - also see comparison of 4km vs 2km in commented block below

NbrL <- dnearneigh(st_centroid(agg_citibike_jan), 0, 1.6)
## Warning: st_centroid assumes attributes are constant over geometries
## Warning in dnearneigh(st_centroid(agg_citibike_jan), 0, 1.6): neighbour object
## has 2 sub-graphs
# Creating a binary weights list of neighbours not including self (for Gi)
D <- nb2listw(NbrL, style='B')
# Creating a list of neighbours, but include self (for Gi*)
D_star <- nb2listw(include.self(NbrL), style='B')

# General G tests: keep BOTH results. The original assigned both calls to G,
# so the D-based (Gi) test result was immediately overwritten and lost.
G <- globalG.test(agg_citibike_jan$total_ride_start_count, D)
G_star <- globalG.test(agg_citibike_jan$total_ride_start_count, D_star)

# Check if there are any isolated observations (no neighbors) by returning the length (number of neighbours) for each polygon
sapply(NbrL, length)
##   [1] 29 31 39 33 32 32 29 39 28 37 45 48 26 23 41 45 39 44 41 42 41 42 47 44 40
##  [26] 38 32 31 37 33 33 43 31 32 34 37 41 28 42 38 28 39 36 29 40 42 41 33 24 32
##  [51] 40 38 39 32 41 40 37 31 43 29 28 43 28 35 33 33 42 31 33 33 40 32 26 30 22
##  [76] 24 29 27 28 27 29 24 23 27 25 23 27 21 26 17 28 21 31 17 21 26 21 32 34 18
## [101] 26 23 19 18 21 22 22 17 19 34 33 26 22  7  9  6 12 13 11 17 21 21 24 25 27
## [126] 25 21 23 28 27 23 28 30 28 23 21 20 18 15 16 16 13 18 20 15 20 11 10  3 42
## [151] 20 41 33 39 20 25 29 30 35 31 20  9  6 10 18 10 23 20 20 25 38 31 30 27 31
## [176] 14 29 18 22 29 28 27 27 31 16 14 40 45 47 42 28 30 35 36 29 27 25 28 29 28
## [201] 30 32 34 41 28 24 17 15 25 25 28 12 32 43 40 20 29 19 16 22 17 18 24 20 22
## [226] 33 18 19 26 26 26 28 26 38 40 37 41 40 40 43 45 38 30 24  6 25 27 37 36 44
## [251] 13 36 26 45 44 40 40 35 29 22 17 15 15 22 26 31 37 36 37 37 38 35 31 30 36
## [276] 40 39 35 36 34 32 25 16 21 18 25 31 35 36 31 31 32 30 33 30 28 29 30 27 28
## [301] 29 28 23 13 15 19 24 26 28 25 20 10 27 10 13 19 22 23 23 24 28 25 26 28 29
## [326] 28 25 26 25 25 29 30 20 23 28 27 15 21 16 18 31 31 36 29 24 19 28 31 21 22
## [351] 14 25 16 30 24 25 24 18 27 27  8 19 29 27 17 17 30 35 24 31 31 12 31 14 10
## [376] 24 32 33 25 17 22 25 26 13 13 32 30 24 21 14 31 35 11 11 12 25 20 18 24 27
## [401] 25 28 14 21 18 18 19 17 16  1 45 44 32 33 33 34
# > Compare to: sapply(NbrL, length) when NbrL=4
# [1] 163 166 155 170 167 174 124 166 116 170 153 154 111 105 167 161 139 143 133 154 146 136 144 151 137 133
# [27] 120 174 131 167 122 151 123 170 166 137 161 126 154 142 127 142 169 130 163 145 159 131 115 167 145 161
# [53] 156 170 144 154 163 165 148 166 167 149 163 136 130 160 150 165 133 161 148 133 132 147 119 128 154 150
# [79] 147 142 139 132 131 122 127 130 125 129 115 115 109 123 107 112 118 104 110  89  91  94  88  79  78  66
# [105]  68  70  66  57  57  87  80  71  62  15  15  15  15  15  15 155 107 116 118 111 106 101  95  90  85  84
# [131]  91  93  96 105 106 129  89  83  79  74  65  56  64  67  59  80  98  98  15 159 122 163 169 164  98  90
# [157] 164 158 151 171  78  15  15  15 113  15  76 162 174  96 148 133 130 177 169  67 174 168 114 161 156 153
# [183]  78  84 115 110 161 143 147 157 168 166 126 132 166 166 164 164 162 160 152 163 167 160  95  94 107  99
# [209]  69  76  77  15 168 155 159 110 136 113 134 137 161 135 114 143 119 130 119 115 151 138 140 146 127 162
# [235] 163 163 153 140 150 137 140 139 125 107  66  93 106 128 118 125  77 103  88 128 128 129 117 105  98  92
# [261]  88  84  87  96 100 104 111 124 121 139 149 158 170 166 149 145 131 129 120 117 112 174  97 104  97 105
# [287] 121 127 134 132 124 115 111 118 124 130 117 112 111 112 111 107  98  79  83  89  93  97  96 104  87  68
# [313] 115  86  91  98  95  96 105 112 119 112 119 127 125 139 138 142 159 160 142 144 173 173 157 155 158 114
# [339] 147  99 141 144 109  96 122 113 178 177  94  95  84 130  97 162 116 115 178  83 177 172  81 125 149 172
# [365] 164 142 166 160 125 138 133  86 173  58  69 115 130 134 138  84 112 124 123  15  15 128 120 116 107  83
# [391] 173 176  15  15  96 175  86  73 168 166 118 120 105 111  86  83 143 151  92  40 155 146 172 174 144 153

# Local G statistics: Gi uses neighbours excluding the focal tract (D),
# Gi* includes the focal tract itself (D_star)
Gi <- localG(agg_citibike_jan$total_ride_start_count, D)
agg_citibike_jan$Gi <- Gi
Gi_star <- localG(agg_citibike_jan$total_ride_start_count, D_star)
agg_citibike_jan$Gi_star <- Gi_star

# Map the hot/cold spot z-scores (reversed RdBu: red = high values cluster)
tm_shape(agg_citibike_jan) + tm_polygons(col='Gi', palette='-RdBu', style='quantile')

tm_shape(agg_citibike_jan) + tm_polygons(col='Gi_star', palette='-RdBu', style='quantile')

[add in finding - Midtown Manhattan clusters]

Local Moran’s I

# Local Moran's I per census tract, using the contiguity weights list Wl
Ii <- localmoran(agg_citibike_jan$total_ride_start_count, Wl)
agg_citibike_jan$Ii <- Ii[,'Ii']
tm_shape(agg_citibike_jan) + tm_polygons(col='Ii', palette='-RdBu', style='quantile')

# Unadjusted two-sided p-values from the localmoran output
agg_citibike_jan$Iip_unadjusted <- Ii[,'Pr(z != E(Ii))']
agg_citibike_jan$Ii_un_sig <- 'nonsignificant'
agg_citibike_jan$Ii_un_sig[which(agg_citibike_jan$Iip_unadjusted < 0.05)] <- 'significant'
tm_shape(agg_citibike_jan) + tm_polygons(col='Ii_un_sig', palette='-RdBu')

# Adjusting with the Bonferroni method to control for multiple testing
# across the hundreds of tracts (conservative: fewer tracts stay significant)
agg_citibike_jan$Iip_adjusted <- p.adjust(agg_citibike_jan$Iip_unadjusted, method='bonferroni')
agg_citibike_jan$Ii_ad_sig <- 'nonsignificant'
agg_citibike_jan$Ii_ad_sig[which(agg_citibike_jan$Iip_adjusted < 0.05)] <- 'significant'
tm_shape(agg_citibike_jan) + tm_polygons(col='Ii_ad_sig', palette='-RdBu')

# Looking for clusters from local Moran
moranCluster <- function(shape, W, var, alpha=0.05, p.adjust.method='bonferroni')
{
  # Code adapted from https://rpubs.com/Hailstone/346625
  Ii <- localmoran(shape[[var]], W)
  shape$Ii <- Ii[,"Ii"]
  Iip <- p.adjust(Ii[,"Pr(z != E(Ii))"], method=p.adjust.method)
  shape$Iip <- Iip
  shape$sig <- shape$Iip<alpha
  # Scale the data to obtain low and high values
  shape$scaled <- scale(shape[[var]]) # high low values at location i
  shape$lag_scaled <- lag.listw(Wl, shape$scaled) # high low values at neighbours j
  shape$lag_cat <- factor(ifelse(shape$scaled>0 & shape$lag_scaled>0, "HH",
                                 ifelse(shape$scaled>0 & shape$lag_scaled<0, "HL",
                                        ifelse(shape$scaled<0 & shape$lag_scaled<0, "LL",
                                               ifelse(shape$scaled<0 & shape$lag_scaled<0, "LH", "Equivalent")))))
  shape$sig_cluster <- as.character(shape$lag_cat)
  shape$sig_cluster[!shape$sig] <- "Non-sig"
  shape$sig_cluster <- as.factor(shape$sig_cluster)
  results <- data.frame(Ii=shape$Ii, pvalue=shape$Iip, type=shape$lag_cat, sig=shape$sig_cluster)
  
  return(list(results=results))
}

# Classify tracts into significant HH/HL/LL/LH clusters using the
# contiguity weights list Wl, then map the labels
clusters <- moranCluster(agg_citibike_jan, W=Wl, var='total_ride_start_count')$results
agg_citibike_jan$Ii_cluster <- clusters$sig

tm_shape(agg_citibike_jan) + tm_polygons(col='Ii_cluster')

Methodology

Describe the method you are using to analyse the data

OLS Linear Model

# Testing for spatial autocorrelation in regression errors.
# Baseline OLS: ride starts per tract explained by population structure,
# housing, household size, group quarters, tract area and station count
janbike.lm <- lm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + num_stations, data=agg_citibike_jan)
# Saving the residuals as a column
agg_citibike_jan$lm.res <- residuals(janbike.lm)
# Plotting the residuals to look for spatial patterning
tm_shape(agg_citibike_jan)+tm_polygons('lm.res', palette='-RdBu', style='quantile')

# # Testing for residual autocorrelation
# # Created a weights list for the Global Moran's I calculation earlier
# nb <- poly2nb(agg_citibike_jan)
# Wl <- nb2listw(nb, zero.policy = TRUE)
# 
# # Created a weights list based off distance for the Gi and Gi* calculation earlier
# NbrL <- dnearneigh(st_centroid(agg_citibike_jan), 0, 1.6)
# D <- nb2listw(NbrL, style='B')

# Moran's I on the OLS residuals, using the distance-based binary weights D
lm.morantest(janbike.lm, D)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan)
## weights: D
## 
## Moran I statistic standard deviate = 28.714, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.3328527652    -0.0060070098     0.0001392648
# Robust LM test for a missing spatial lag term.
# NOTE(review): D is binary (style='B'), hence the "not row standardized"
# warning -- consider a style='W' weights list here. TODO confirm.
lm.LMtests(janbike.lm, D, test='RLMlag')
## Warning in lm.RStests(model = model, listw = listw, zero.policy = zero.policy,
## : Spatial weights matrix not row standardized
## 
##  Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
##  dependence
## 
## data:  
## model: lm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan)
## test weights: listw
## 
## adjRSlag = 158.17, df = 1, p-value < 2.2e-16
# Robust LM test for spatially autocorrelated errors.
# NOTE(review): D is binary (style='B'), hence the "not row standardized"
# warning -- consider a style='W' weights list here. TODO confirm.
lm.LMtests(janbike.lm, D, test='RLMerr')
## Warning in lm.RStests(model = model, listw = listw, zero.policy = zero.policy,
## : Spatial weights matrix not row standardized
## 
##  Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
##  dependence
## 
## data:  
## model: lm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan)
## test weights: listw
## 
## adjRSerr = 322.01, df = 1, p-value < 2.2e-16

Spatial Lag Regression

# Load regression libraries
library(spgwr)
library(spatialreg)

# Spatial Lag Regression: same covariates as the OLS model, plus a spatially
# lagged dependent variable built from weights list D
janbike.lag <- lagsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + num_stations, data=agg_citibike_jan, listw=D)
## Warning in lagsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 2.39674e-20 - using numerical Hessian.
## Warning in sqrt(diag(fdHess)[-1]): NaNs produced
summary(janbike.lag)
## 
## Call:lagsarlm(formula = total_ride_start_count ~ PopU18 + Pop19t64 + 
##     Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + 
##     num_stations, data = agg_citibike_jan, listw = D)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3681.96  -670.06    37.82   635.87  9969.37 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##                 Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)  -2.5150e+03  5.9088e+02 -4.2563 2.078e-05
## PopU18        9.1995e-02  4.4801e-01  0.2053  0.837307
## Pop19t64     -1.3586e-01  8.8707e-02 -1.5315  0.125640
## Pop65pl      -4.2706e-01  2.7530e-01 -1.5513  0.120837
## HUnits        5.4746e-01  1.8049e-01  3.0331  0.002421
## Fam          -2.8655e-01  6.9216e-01 -0.4140  0.678876
## AvgHHSz       1.8653e+02  2.4386e+02  0.7649  0.444325
## GQClgHsg      1.4038e-02         NaN     NaN       NaN
## Shape_Area    3.9809e-05  3.7186e-05  1.0705  0.284377
## num_stations  1.2015e+03  7.4877e+01 16.0469 < 2.2e-16
## 
## Rho: 0.0227, LR test value: 247.51, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.0013363
##     z-value: 16.987, p-value: < 2.22e-16
## Wald statistic: 288.57, p-value: < 2.22e-16
## 
## Log likelihood: -3612.249 for lag model
## ML residual variance (sigma squared): 1995200, (sigma: 1412.5)
## Number of observations: 416 
## Number of parameters estimated: 12 
## AIC: 7248.5, (AIC for lm: 7494)
# Map the lag-model residuals to see whether spatial structure remains
agg_citibike_jan$lag.res <- residuals(janbike.lag)
tm_shape(agg_citibike_jan) + tm_polygons('lag.res', palette='-RdBu', style='quantile')

Spatial Error Regression

# Spatial Error Regression: models the spatial dependence in the error term
# rather than the response (same covariates and weights list D)
janbike.err <- errorsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + num_stations, data=agg_citibike_jan, listw=D)
## Warning in errorsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 3.77442e-20 - using numerical Hessian.
summary(janbike.err)
## 
## Call:errorsarlm(formula = total_ride_start_count ~ PopU18 + Pop19t64 + 
##     Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + 
##     num_stations, data = agg_citibike_jan, listw = D)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -3636.986  -675.220    99.102   545.406  9844.031 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                 Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)  -2.1134e+03  5.2780e+02 -4.0041 6.225e-05
## PopU18       -1.4544e-01  3.4930e-01 -0.4164   0.67713
## Pop19t64     -2.0265e-01  2.0792e-01 -0.9746   0.32973
## Pop65pl      -5.3638e-01  3.2736e-01 -1.6385   0.10132
## HUnits        4.8465e-01  2.5749e-01  1.8822   0.05981
## Fam           3.3913e-01  6.3178e-01  0.5368   0.59141
## AvgHHSz       2.1974e+02  2.1829e+02  1.0066   0.31412
## GQClgHsg      3.1507e-02  2.8754e-01  0.1096   0.91275
## Shape_Area    3.1857e-05  3.6138e-05  0.8815   0.37803
## num_stations  1.2217e+03  7.3690e+01 16.5793 < 2.2e-16
## 
## Lambda: 0.026931, LR test value: 245.47, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.00067988
##     z-value: 39.611, p-value: < 2.22e-16
## Wald statistic: 1569.1, p-value: < 2.22e-16
## 
## Log likelihood: -3613.266 for error model
## ML residual variance (sigma squared): 1967600, (sigma: 1402.7)
## Number of observations: 416 
## Number of parameters estimated: 12 
## AIC: 7250.5, (AIC for lm: 7494)
agg_citibike_jan$err.res <- residuals(janbike.err)
# Fitted values on the response scale. The model was fit to raw counts (no
# log transform of total_ride_start_count), so the original
# exp(fitted.values(...)) -- a leftover from a log-response template --
# exponentiated already-untransformed fitted counts.
agg_citibike_jan$err.fit <- fitted.values(janbike.err)

tm_shape(agg_citibike_jan) + tm_polygons('err.res', palette='-RdBu', style='quantile')

Results

Present the results of your individual section

Discussion and Conclusions

Combine and discuss results here (e.g. for final site selection, or to discuss the relative merits of approaches). Include limitations and future work

+++++++

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.